Dataset - First Exploration

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the confirmed-cases dataset (assumes the CSV sits in the working directory).
colombia_covid <- read_csv("colombia_covid.csv")
## Parsed with column specification:
## cols(
##   `ID de caso` = col_double(),
##   `Fecha de diagnóstico` = col_character(),
##   `Ciudad de ubicación` = col_character(),
##   `Departamento o Distrito` = col_character(),
##   `Atención**` = col_character(),
##   Edad = col_double(),
##   Sexo = col_character(),
##   `Tipo*` = col_character(),
##   `País de procedencia` = col_character()
## )
# Which departments/districts appear in the data?
unique(colombia_covid$`Departamento o Distrito`)
##  [1] "Bogotá D.C."           "Valle del Cauca"       "Antioquia"            
##  [4] "Cartagena D.T. y C"    "Huila"                 "Meta"                 
##  [7] "Risaralda"             "Norte de Santander"    "Caldas"               
## [10] "Cundinamarca"          "Barranquilla D.E."     "Santander"            
## [13] "Quindío"               "Tolima"                "Cauca"                
## [16] "Santa Marta D.T. y C." "Cesar"                 "San Andrés"           
## [19] "Casanare"              "Nariño"                "Atlántico"            
## [22] "Boyacá"                "Córdoba"               "Bolívar"              
## [25] "Sucre"                 "La Guajira"
# Same information via dplyr: summarise on a grouped tibble with no summary
# expressions yields one row per group (sorted alphabetically by group).
colombia_covid %>%
  group_by(`Departamento o Distrito`) %>%
  summarise
## # A tibble: 26 x 1
##    `Departamento o Distrito`
##    <chr>                    
##  1 Antioquia                
##  2 Atlántico                
##  3 Barranquilla D.E.        
##  4 Bogotá D.C.              
##  5 Bolívar                  
##  6 Boyacá                   
##  7 Caldas                   
##  8 Cartagena D.T. y C       
##  9 Casanare                 
## 10 Cauca                    
## # … with 16 more rows
# Latitude, longitude and confirmed-case count for each department marker.
#bogota<c(4.592164298, -74.072166378, 542)
bogota<-c(4.624335, -74.063644, 542)
valle_cauca<-c(3.359889, -76.638565, 162) # Cauca is the same
antioquia<-c(6.230833, -75.590553, 127)
cartagena<-c(10.39972, -75.51444, 39)
huila<-c(2.916112, -75.283440, 30)
meta<-c(4.151382, -73.637688, 12)
risaralda<-c(4.8133302,  -75.6961136, 35)
norte_santander<-c(7.916663, -72.666664, 21)
caldas<-c(5.070275, -75.513817, 16)
cudinamarca<-c(4.862437, -74.058655, 42)
barraquilla<-c(10.9583295, -74.791163502, 35) # Atlantico
santader<-c(7.12539, -73.1198, 12)
quindio<-c(4.535000, -75.675690, 23)
tolima<-c(4.43889, -75.23222, 14)
santa_marta<-c(11.24079, -74.19904, 12)
cesar<-c(10.46314, -73.25322, 16)
san_andres<-c(12.542499, -81.718369, 2)
casanare<-c(5.33775, -72.39586, 2)
narino<-c(1.21361, -77.28111, 6)
boyaca<-c(5.767222, -72.940651, 6)
cordoba<-c(8.74798, -75.88143, 2)
bolivar<-c(4.3387, -76.18342, 3)
sucre<-c(8.81136, -74.72084, 1)
guajira<-c(7.370165186, -76.7088304, 1)

# Assemble the GIS table in a single step instead of growing it with 23
# consecutive rbind() calls (quadratic copying, and easy to drop a row).
# Row order matches the original: Bogotá first, then the other departments.
gis_data <- as.data.frame(do.call(rbind, list(
  bogota, valle_cauca, antioquia, cartagena, huila, meta, risaralda,
  norte_santander, caldas, cudinamarca, barraquilla, santader, quindio,
  tolima, santa_marta, cesar, san_andres, casanare, narino, boyaca,
  cordoba, bolivar, sucre, guajira
)))
names(gis_data) <- c("latitude", "longitude", "cases")

gis_data
##     latitude longitude cases
## 1   4.624335 -74.06364   542
## 2   3.359889 -76.63856   162
## 3   6.230833 -75.59055   127
## 4  10.399720 -75.51444    39
## 5   2.916112 -75.28344    30
## 6   4.151382 -73.63769    12
## 7   4.813330 -75.69611    35
## 8   7.916663 -72.66666    21
## 9   5.070275 -75.51382    16
## 10  4.862437 -74.05866    42
## 11 10.958329 -74.79116    35
## 12  7.125390 -73.11980    12
## 13  4.535000 -75.67569    23
## 14  4.438890 -75.23222    14
## 15 11.240790 -74.19904    12
## 16 10.463140 -73.25322    16
## 17 12.542499 -81.71837     2
## 18  5.337750 -72.39586     2
## 19  1.213610 -77.28111     6
## 20  5.767222 -72.94065     6
## 21  8.747980 -75.88143     2
## 22  4.338700 -76.18342     3
## 23  8.811360 -74.72084     1
## 24  7.370165 -76.70883     1
library(ggplot2)
library(leaflet)
library(geojsonio)
## 
## Attaching package: 'geojsonio'
## The following object is masked from 'package:base':
## 
##     pretty
# Map each department's case count to a marker colour bucket:
#   cases <= 10 -> "green"; 11..100 -> "orange"; > 100 -> "red".
#
# @param gis_data data frame with a numeric `cases` column.
# @return character vector of colour names, one per row of `gis_data`.
#   Fix: vapply() is type-stable, so an empty `cases` column yields
#   character(0) instead of the empty list sapply() would return.
getColor <- function(gis_data) {
  vapply(gis_data$cases, function(cases) {
    if (cases <= 10) {
      "green"
    } else if (cases <= 100) {
      "orange"
    } else {
      "red"
    }
  }, character(1))
}

# Marker icons for the leaflet map; marker colour encodes each department's
# case bucket as computed by getColor() (green / orange / red).
icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = getColor(gis_data)
)



# Department polygons for Colombia, read as a Spatial ("sp") object.
# NOTE(review): assumes "Colombia.geo.json" is in the working directory.
dept<-geojsonio::geojson_read("Colombia.geo.json", what="sp")

# One HTML label per department (shown when hovering the polygons).
labels <- sprintf(
  "<strong>%s</strong><br/>",
  dept$NOMBRE_DPT
) %>% lapply(htmltools::HTML)

# Interactive map: one marker per department (labelled with its case count)
# drawn over the department polygons, with hover highlighting.
m <- leaflet(data=gis_data) %>% addTiles() %>%
  addAwesomeMarkers(~longitude, ~latitude, icon=icons, label = ~as.character(cases)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(data=dept,     
              opacity = 0.2,
              color = "yellow",
              dashArray = "3",
              fillOpacity = 0.1,
              # Thicken and darken the polygon under the cursor.
              highlight = highlightOptions(
                weight = 5,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = TRUE),
                label = labels)
m

Reading here and there I found that Colombia is divided into 5 regions; the central one comprises: Boyacá, Tolima, Cundinamarca, Meta, and Bogotá D.C.

However, since in our assignment Colombia is divided into 3 parts, I think that we should add some more departments (e.g. Quindío, Valle del Cauca, Risaralda, Caldas, Boyacá and possibly Antioquia and Santander)

# Departments assigned to "central Colombia" for our three-way split.
# Fix: "Boyacá" was listed twice in the original vector.
central.colombia.dep <- c(
  "Bogotá D.C.", "Tolima", "Cundinamarca", "Meta", "Boyacá", "Quindío",
  "Valle del Cauca", "Risaralda", "Caldas", "Antioquia", "Santander"
)

# Indices of the rows belonging to the central departments.
central.colombia.rows<-which(colombia_covid$`Departamento o Distrito` %in% central.colombia.dep)

# NOTE(review): this overwrites colombia_covid in place — the full national
# dataset is no longer available after this point.
colombia_covid<-colombia_covid[central.colombia.rows,]
colombia_covid
## # A tibble: 979 x 9
##    `ID de caso` `Fecha de diagn… `Ciudad de ubic… `Departamento o… `Atención**`
##           <dbl> <chr>            <chr>            <chr>            <chr>       
##  1            1 06/03/2020       Bogotá           Bogotá D.C.      Recuperado  
##  2            2 09/03/2020       Buga             Valle del Cauca  Recuperado  
##  3            3 09/03/2020       Medellín         Antioquia        Recuperado  
##  4            4 11/03/2020       Medellín         Antioquia        Recuperado  
##  5            5 11/03/2020       Medellín         Antioquia        Recuperado  
##  6            6 11/03/2020       Itagüí           Antioquia        Casa        
##  7            8 11/03/2020       Bogotá           Bogotá D.C.      Recuperado  
##  8            9 11/03/2020       Bogotá           Bogotá D.C.      Recuperado  
##  9           10 12/03/2020       Bogotá           Bogotá D.C.      Recuperado  
## 10           11 12/03/2020       Bogotá           Bogotá D.C.      Casa        
## # … with 969 more rows, and 4 more variables: Edad <dbl>, Sexo <chr>,
## #   `Tipo*` <chr>, `País de procedencia` <chr>

Some very basic plots

Let’s check the situation (and also the power of ggplot)!

Scattered infos about pandemic in Colombia (https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Colombia):

*the quarantine started on the 20th of March; since our data run from the 6th of March to the 2nd of April, it is very likely that quarantine effects are not witnessed in our data;

*on March the 26th there was damage to the machine that prepared the samples for processing and subsequent diagnosis of COVID-19; this could explain the very low number of confirmed cases;

theme_set(theme_classic())
# Waffle chart: each tile on a 10x10 grid represents ~1% of the confirmed
# cases, coloured by department.
region<-colombia_covid$`Departamento o Distrito`
nrows<-10
df <- expand.grid(y = 1:nrows, x = 1:nrows)
# Scale per-department counts to the grid.
# NOTE(review): after rounding, the counts may not sum to exactly nrows^2,
# in which case rep() would not match nrow(df) — worth an explicit check.
categ_table <- round(table(region) * ((nrows*nrows+1)/(length(region))))
df$category<-factor(rep(names(categ_table), categ_table))
# Fix: reference the bare column name inside aes(); aes(fill = df$category)
# bypasses ggplot's data masking and breaks facetting/subsetting.
waffle_chart <- ggplot(df, aes(x = x, y = y, fill = category)) + 
        geom_tile(color = "black", size = 0.5) +
        scale_x_continuous(expand = c(0, 0)) +
        scale_y_continuous(expand = c(0, 0), trans = 'reverse') +
        scale_fill_brewer(palette = "Set3") +
        labs(title="Frequency of cases across Departments", subtitle="Waffle Chart") + 
        theme(plot.title = element_text(size = rel(1.2)),
              axis.text = element_blank(),
              axis.title = element_blank(),
              axis.ticks = element_blank(),
              legend.title = element_blank(),
              legend.position = "right")
waffle_chart

The majority of cases are in the capital, Bogotá.

theme_set(theme_classic())
# Number of cases confirmed day by day.

# Convert the day column to Date ("international" format) so the x axis is
# ordered chronologically instead of lexicographically.
colombia_covid$`Fecha de diagnóstico`<-as.Date(colombia_covid$`Fecha de diagnóstico`, format="%d/%m/%Y")

dayli_confirmed<-ggplot(colombia_covid, aes(x = `Fecha de diagnóstico`)) +
  scale_fill_brewer(palette = "Spectral")
# Fix: geom_bar() is the idiomatic geom for counting discrete values;
# geom_histogram(stat = "count") emits a warning and ignores `width`.
dayli_confirmed + geom_bar(aes(fill=`Departamento o Distrito`), width = 0.8) +
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
  labs(title = "Daily number of confirmed cases", 
       subtitle = "subdivided across departments",
       x = "Date of confirmation",
       fill = "Department")

The previous plot represents the daily incidence of the disease across all the departments we are taking into account.

Let’s check the general trend by looking at the cumulative number of confirmed cases (again, all “our” departments are taken into account):

# The max of `ID de caso` for each date is the cumulative number of cases
# confirmed up to that date.
# NOTE(review): case IDs are assigned nationally, but colombia_covid was
# filtered to the central departments above — so max(ID) per date tracks the
# NATIONAL cumulative count, not the cumulative count of the filtered subset
# (the printed totals match the national curve). Confirm this is intended.
cumulative<- as.data.frame(colombia_covid %>%
  group_by(`Fecha de diagnóstico`) %>%
  summarise(max(`ID de caso`)))
  
# Give the auto-named summary column a readable name.
names(cumulative)<-c("Fecha de diagnóstico", "Cumulative confirmed")
cumulative
##    Fecha de diagnóstico Cumulative confirmed
## 1            2020-03-06                    1
## 2            2020-03-09                    3
## 3            2020-03-11                    9
## 4            2020-03-12                   11
## 5            2020-03-13                   16
## 6            2020-03-14                   24
## 7            2020-03-15                   45
## 8            2020-03-16                   57
## 9            2020-03-17                   75
## 10           2020-03-18                  102
## 11           2020-03-19                  128
## 12           2020-03-20                  175
## 13           2020-03-21                  210
## 14           2020-03-22                  240
## 15           2020-03-23                  306
## 16           2020-03-24                  419
## 17           2020-03-25                  481
## 18           2020-03-26                  491
## 19           2020-03-27                  539
## 20           2020-03-28                  603
## 21           2020-03-29                  700
## 22           2020-03-30                  798
## 23           2020-03-31                  906
## 24           2020-04-01                 1065
## 25           2020-04-02                 1161
# Lollipop plot of the cumulative curve: a point at each day's total with a
# segment dropping to zero.
cumulative_plot <- ggplot(cumulative,
                          aes(x = `Fecha de diagnóstico`, y = `Cumulative confirmed`))

cumulative_plot +
  geom_point(size = 3) +
  geom_segment(aes(x = `Fecha de diagnóstico`,
                   xend = `Fecha de diagnóstico`,
                   y = 0,
                   yend = `Cumulative confirmed`)) +
  labs(title = "Cumulative number of confirmed cases",
       x = "Date of confirmation") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

Here the growth seems exponential (and this is consistent with the fact that we are studying the early stages of the outbreak).

In order to confirm it we should fit a log-linear model, and check that it produces a constant growth rate (straight line).

Now let’s explore the distribution of cases across genders and age:

library(ggthemes)
# Population-pyramid style plot: female counts to one side, male counts
# mirrored past zero (plotted as negative y), one bar pair per department.
brks <- seq(-250, 250, 50)
# Fix: label both sides with absolute counts — the original labels left the
# male side showing negative numbers.
lbls <- as.character(abs(brks))

ggplot(data=colombia_covid, aes(x=`Departamento o Distrito`, fill = Sexo)) +  
                              geom_bar(data = subset(colombia_covid, Sexo == "F")) +
                              geom_bar(data = subset(colombia_covid, Sexo == "M"), aes(y=..count..*(-1))) + 
                              scale_y_continuous(breaks = brks,
                                               labels = lbls) + 
                              coord_flip() +  
                              labs(title="Spread of the disease across genders",
                                   y = "Number of cases",
                                   x = "Department",
                                   fill = "Gender") +
                              theme_tufte() +  
                              theme(plot.title = element_text(hjust = .5), 
                                    axis.ticks = element_blank()) +   
                              # Fix: "Dark3" is not an RColorBrewer palette; the
                              # qualitative palette is "Dark2".
                              scale_fill_brewer(palette = "Dark2")  

Maybe in order to study the distribution of ages we should divide the ages into groups, for example 0-18, 19-30, 31-45, 46-60, 61-75, 76+.

# Create column age_group (does not modify the original dataset).
# Fix: the 61-75 bucket was labelled '60-75', contradicting its own condition
# (Edad >= 61) and visually overlapping the '46-60' label.
# Edad outside all ranges (or NA) falls through to NA, case_when's default.
age_groups<-colombia_covid %>% mutate(age_group = case_when(Edad <= 18 ~ '0-18',
                                             Edad >= 19  & Edad <= 30 ~ '19-30',
                                             Edad >=  31 & Edad <= 45 ~ '31-45',
                                             Edad >= 46 & Edad <= 60 ~ '46-60',
                                             Edad >=61 & Edad <= 75 ~ '61-75',
                                             Edad >=76 ~ '76+'))

# Per-group counts and shares, used to label the pie chart precisely.
# count(age_group) is shorthand for group_by + tally + ungroup.
age_groups_pie <- age_groups %>%
  count(age_group) %>%
  mutate(per = n / sum(n)) %>%
  arrange(desc(age_group))
age_groups_pie$label <- scales::percent(age_groups_pie$per)

library(ggrepel)

# Pie chart of the age-group shares, with repelled percentage labels so they
# don't overlap near the centre.
age_pie <- ggplot(age_groups_pie, aes(x = "", y = per, fill = factor(age_group))) + 
  # geom_col() is the idiomatic spelling of geom_bar(stat = "identity").
  geom_col(width = 1) +
  theme(axis.line = element_blank(), 
        plot.title = element_text(hjust=0.5)) + 
  labs(fill="Age groups", 
       x=NULL, 
       y=NULL, 
       title="Distribution of the disease across ages") +
  coord_polar(theta = "y") +
  # Place each label at the angular midpoint of its slice.
  geom_label_repel(aes(x=1, y=cumsum(per) - per/2, label=label),
                   size=3, show.legend = FALSE, nudge_x = 0) +
  guides(fill = guide_legend(title = "Group"))
  
age_pie 

This is quite surprising… I expected elderly people to be more affected by Covid-19!

Now we can analyze jointly the distribution of age and sex (the sex distribution across age groups):

#library(ggpol)

# Count cases per (age group, sex). The dummy cbind(pop = Sexo) makes
# aggregate() hand each group's vector to FUN, so length() gives group sizes.
keep <- c("Sexo", "age_group")
age_groups <- age_groups[names(age_groups) %in% keep]
age_groups_count <- aggregate(cbind(pop = Sexo) ~ age_group + Sexo,
                              data = age_groups,
                              FUN = length)
# Mirror the female counts below zero for a back-to-back bar chart.
age_groups_count$count <- ifelse(age_groups_count$Sexo == "F",
                                 -age_groups_count$pop,
                                 age_groups_count$pop)
age_groups_count <- as.data.frame(age_groups_count[names(age_groups_count) != "pop"])
age_groups_count
##    age_group Sexo count
## 1       0-18    F   -20
## 2      19-30    F  -119
## 3      31-45    F  -149
## 4      46-60    F  -113
## 5      60-75    F   -69
## 6        76+    F   -11
## 7       0-18    M    23
## 8      19-30    M   119
## 9      31-45    M   154
## 10     46-60    M   124
## 11     60-75    M    61
## 12       76+    M    17
# Back-to-back bar chart: one bar per age group, female counts mirrored
# below zero, flipped so the age groups read down the side.
ggplot(age_groups_count, aes(x = age_group, y = count, fill = Sexo)) +
  geom_col() +
  coord_flip(clip = "off") +
  theme_minimal() +
  labs(title = "Distribution of sex by age",
       y = "",
       x = "Age group")

There isn’t much difference between the sexes among the different age groups; I have the impression that the covariates present in the dataset won’t help us!! :(

We are now left to explore the Tipo variable:

theme_set(theme_classic())
# Renamed the attribute since the * in `Tipo*` made it awkward to reference.
# Fix: rename by name rather than by positional index [8], which silently
# relabels the wrong column if the column order ever changes.
names(colombia_covid)[names(colombia_covid) == "Tipo*"] <- "tipo"

type<-ggplot(colombia_covid, aes(x = `Fecha de diagnóstico`)) +
  scale_fill_brewer(palette = "Set3")
# Fix: geom_bar() is the idiomatic geom for counts; geom_histogram(stat =
# "count") warns and ignores `width`.
type + geom_bar(aes(fill=tipo), width = 0.8) +
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
  labs(title = "Daily number of confirmed cases", 
       subtitle = "subdivided across type",
       x = "Date of confirmation",
       fill = "Type")

I think that en estudio means that it is not clear whether the case is imported or not; however, it seems like there are more imported cases, so we can count them:

# Count cases per type and attach a percentage label, then keep only the
# columns worth printing.
type_pie <- colombia_covid %>%
  count(tipo) %>%
  mutate(per = n / sum(n)) %>%
  arrange(desc(tipo))
type_pie$label <- scales::percent(type_pie$per)
type_pie <- type_pie[names(type_pie) != "per"]
colnames(type_pie) <- c("tipo", "total_number", "percentage")
type_pie
## # A tibble: 3 x 3
##   tipo        total_number percentage
##   <chr>              <int> <chr>     
## 1 Relacionado          281 28.7%     
## 2 Importado            465 47.5%     
## 3 En estudio           233 23.8%

Almost half of the total confirmed cases in our region of interest are imported, and for a significant percentage it is unknown whether they are imported or not. Again, this is in some sense interesting, but I don’t see clearly why it should be helpful in our model!

# Countries of origin for the imported cases.
imported<-subset(colombia_covid, tipo=="Importado", select = c("País de procedencia"))
# count() on the grouped tibble leaves the result grouped by country, which
# is why the printout carries a "# Groups" header.
imported %>%
  group_by(`País de procedencia`) %>%
  count() 
## # A tibble: 55 x 2
## # Groups:   País de procedencia [55]
##    `País de procedencia`     n
##    <chr>                 <int>
##  1 0                         1
##  2 Alemania                  4
##  3 Alemania - Estambul       1
##  4 Arabia                    1
##  5 Argentina                 2
##  6 Aruba                     2
##  7 Bélgica                   1
##  8 Brasil                   10
##  9 Canadá                    1
## 10 Chile                     2
## # … with 45 more rows

Here the data are a bit dirty; however, I don’t know whether the effort of cleaning them will be worth the result — it depends on whether we decide to use this info in our analysis.

Missing

I still didn’t integrate the “other part” of the dataset, the one concerning deaths!

Ideas

As for the predictive model we want to build, I think that we should start with something very simple (e.g. a (log-)linear model) and take it as a baseline.

Then we build something more complex (such as a hierarchical model) and see the improvements with respect to the baseline.

If possible I would put inside something bayesian, since I understood that they really like this kind of stuff!